# Load the libraries 
library(tidyverse)
library(plotly)
library(broom)
library(knitr)

This mini-project demands a good understanding of the statistical test. For more information about statistical testing, please refer to this link. In this mini-project, the task is to analyze gapminder_clean.csvdataset using Rand tidyverse. For this mini-project, the p-value decision rule is set to be \(\alpha = 0.05\).

Part 1

Complete the following analysis in R and generate a RMarkdownreport to show the analysis and results:

Question 1

Read in the gapminder_clean.csv data as a tibble using read_csv.

gapminder <- read_csv("gapminder_clean.csv")

# Rename column names of gapminder dataset. 
gapminder <- gapminder %>% rename(co2_emission = `CO2 emissions (metric tons per capita)`, country_name =`Country Name`, 
energy_use = `Energy use (kg of oil equivalent per capita)`, imports_services = `Imports of goods and services (% of GDP)`, 
population_density = `Population density (people per sq. km of land area)`, 
life_expectancy = `Life expectancy at birth, total (years)`)

Question 2

Filter the data to include only rows where Year is 1962 and then make a scatter plot comparing CO2 emissions (metric tons per capita) and gdpPercap for the filtered data.

# Filter gapminder dataset with year 1962 and scatterplot (in logarithmic scale) comparing CO2 emissions and gdpPercap
gapminder %>%
  filter(Year == 1962) %>% 
  ggplot(aes(x = gdpPercap, y = co2_emission)) + 
  geom_point(alpha = 0.3) + scale_x_log10(name = "GDP per capita (log10)") + 
  scale_y_log10(name = "CO2 emissions metric tons per capita (log10)") + labs(title="CO2 emissions vs GDP in 1962 (Logarithmic Scale)") + theme_classic()

Question 3

On the filtered data, calculate the pearson correlation of CO2 emissions (metric tons per capita) and gdpPercap. What is the Pearson R value and associated p value?

# Filter the data
gapminder_1962 <- gapminder %>%
  filter(Year == 1962)

# Correlation test 
test_result <- tidy(cor.test(gapminder_1962$co2_emission,gapminder_1962$gdpPercap))
test_result
## # A tibble: 1 × 8
##   estimate statistic  p.value parameter conf.low conf.high method    alternative
##      <dbl>     <dbl>    <dbl>     <int>    <dbl>     <dbl> <chr>     <chr>      
## 1    0.926      25.3 1.13e-46       106    0.893     0.949 Pearson'… two.sided

From the table above, the correlation between CO2 emissions and gdpPercap is 0.9260817 and the associated p value is 1.1286792^{-46}.

Question 4

On the unfiltered data, answer “In what year is the correlation between CO2 emissions (metric tons per capita) and gdpPercap the strongest?” Filter the dataset to that year for the next step…

# Correlation for all years
gap_cor <- gapminder %>%
  group_by(Year) %>%
  summarize(correlation = cor(co2_emission, gdpPercap, use = "complete.obs")) %>%
  arrange(desc(correlation))

gap_cor
## # A tibble: 10 × 2
##     Year correlation
##    <dbl>       <dbl>
##  1  1967       0.939
##  2  1962       0.926
##  3  1972       0.843
##  4  1982       0.817
##  5  1987       0.810
##  6  1992       0.809
##  7  1997       0.808
##  8  2002       0.801
##  9  1977       0.793
## 10  2007       0.720

From the table above, the correlation between CO2 emissions and gdpPercap is the strongest at year 2007 with a correlation value of 0.9387918.

Question 5

Using plotly, create an interative scatter plot comparing CO2 emissions (metric tons per capita) and gdpPercap, where the point size is determined by pop (population) and the color is determined by the continent. You can easily convert any ggplot to a plotly plot using the ggplotly() command.

# Scatterplot for year with strongest correlation between CO2 emissions and gdpPercap
gapminder_scatter_P1Q2 <- gapminder %>%
  filter(Year == 1967, !is.na(continent)) %>%
  ggplot(aes(x = gdpPercap, y = co2_emission, size = pop, color = continent)) + geom_point() + scale_x_log10(name = "GDP per capita (log10)") +
  scale_y_log10("CO2 emissions metric tons per capita (log10)") + guides(color = guide_legend(order=2), size = guide_legend(order = 1)) + 
  labs(title="CO2 emissions vs GDP in 1962 (Logarithmic Scale)") + theme_classic()

ggplotly(gapminder_scatter_P1Q2, width = 800, height = 500)

Part 2

Now, without further guidance, use your R Data Science skills (and appropriate statistical tests) to answer the following:

Question 1

What is the relationship between continent and 'Energy use (kg of oil equivalent per capita)' ? (stats test needed)

# Filter the continent column to avoid missing values
gapminder_continent_filter <- gapminder %>% 
  filter(!is.na(continent))

# Boxplot comparing energy use between continents
ggplot(gapminder_continent_filter, aes(x = continent, y = energy_use)) + geom_boxplot() + labs(title="Relationships between Energy use and Continent") +
  scale_x_discrete(name = 'Continent') + scale_y_continuous(name = 'Energy use (kg of oil equivalent per capita)') + theme_classic()

The boxplot above describes the difference in energy use for different continents. We first use one-way ANOVA test to verify whether this difference is real or it is due to random noise like sampling error.

# p-value outcome from one-way ANOVA test
aov_test_result <- tidy(aov(energy_use~continent, data = gapminder_continent_filter))

aov_test_result
## # A tibble: 2 × 6
##   term         df       sumsq     meansq statistic   p.value
##   <chr>     <dbl>       <dbl>      <dbl>     <dbl>     <dbl>
## 1 continent     4  771482483. 192870621.      51.5  8.53e-39
## 2 Residuals   843 3159591816.   3748033.      NA   NA

From the table above, the p-value for this test is 8.5270035^{-39} and thus it is statistically significant.

one-way ANOVA test only tells us whether there is a significant difference between continents concerning energy usage, but we do not know where these differences occur. To further verify this, we need to perform a post-hoc test, for instance, Tukey’s test to check statistical significance for all possible combinations of two out of all continents.

# Tukey test
tukey_test <- TukeyHSD(aov(energy_use~as.factor(continent), data = gapminder_continent_filter))

# Filter the combinations with p-value >0.05
filtered_tukey_test <- tidy(tukey_test) %>% filter(adj.p.value >= 0.05)

tukey_test
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = energy_use ~ as.factor(continent), data = gapminder_continent_filter)
## 
## $`as.factor(continent)`
##                       diff       lwr       upr     p adj
## Americas-Africa  1005.1037  466.8326 1543.3748 0.0000041
## Asia-Africa      1168.7636  628.2529 1709.2742 0.0000000
## Europe-Africa    2447.5453 1947.3838 2947.7067 0.0000000
## Oceania-Africa   3281.7976 2040.3410 4523.2543 0.0000000
## Asia-Americas     163.6599 -384.4160  711.7357 0.9256447
## Europe-Americas  1442.4416  934.1141 1950.7691 0.0000000
## Oceania-Americas 2276.6940 1031.9249 3521.4630 0.0000069
## Europe-Asia      1278.7817  768.0833 1789.4801 0.0000000
## Oceania-Asia     2113.0341  867.2950 3358.7732 0.0000402
## Oceania-Europe    834.2524 -394.5176 2063.0223 0.3421942

From the table above, we can see that only Asia-Americas, Oceania-Europe combinations are not statistically significant whereas the others do.

Question 2

Is there a significant difference between Europe and Asia with respect to 'Imports of goods and services (% of GDP)' in the years after 1990? (stats test needed)

# Filter Asia and Europe out from continent column
gapminder_Asia_Europe <- gapminder %>% filter(continent %in% c('Europe','Asia'))

# Perform t test
t_test <- t.test(gapminder_Asia_Europe$imports_services~gapminder_Asia_Europe$continent)

df_t_test <- tidy(t_test)
t_test 
## 
##  Welch Two Sample t-test
## 
## data:  gapminder_Asia_Europe$imports_services by gapminder_Asia_Europe$continent
## t = 1.9982, df = 281.72, p-value = 0.04665
## alternative hypothesis: true difference in means between group Asia and group Europe is not equal to 0
## 95 percent confidence interval:
##   0.079458 10.573429
## sample estimates:
##   mean in group Asia mean in group Europe 
##             40.36486             35.03841

For this question, I performed two-sided unpaired t-test. From the table, the resulting p-value is 0.0466528. So I conclude that the difference is statistically significant.

Question 3

What is the country (or countries) that has the highest 'Population density (people per sq. km of land area)' across all years? (i.e., which country has the highest average ranking in this category across each time point in the dataset?)

# gapminder pop density ranking by country across all years
gapminder_ranking_by_year <- gapminder %>%
  group_by(Year) %>%
  arrange(desc(population_density)) %>%
  mutate(ranking = row_number())

# gapminder average pop density ranking by country across all years
gapminder_avg_rank <- gapminder_ranking_by_year %>%
  group_by(country_name) %>% 
  summarize(avg_rank = mean(ranking)) %>%
  arrange(avg_rank)

# countr/countries with best average pop density ranking
best_rank <- gapminder_avg_rank %>% 
  filter(avg_rank == min(avg_rank))

gapminder_avg_rank
## # A tibble: 263 × 2
##    country_name         avg_rank
##    <chr>                   <dbl>
##  1 Macao SAR, China          1.5
##  2 Monaco                    1.5
##  3 Hong Kong SAR, China      3.1
##  4 Singapore                 3.9
##  5 Gibraltar                 5  
##  6 Bermuda                   6.2
##  7 Malta                     7  
##  8 Bangladesh                9.2
##  9 Channel Islands           9.4
## 10 Maldives                 10.7
## # … with 253 more rows

From the table above, Macao SAR, China, Monaco has the highest average ranking of 1.5 (lower is better) in population density across all years.

Question 4

What country (or countries) has shown the greatest increase in 'Life expectancy at birth, total (years)' since 1962?

# life expectancy increase by country across all years
gapminder_overall_lifeExp_increase <- gapminder %>% 
  group_by(country_name) %>% 
  summarize(overall_increase = tail(life_expectancy, n = 1) - head(life_expectancy, n = 1)) %>%
  filter(!is.na(overall_increase)) %>% 
  arrange(desc(overall_increase))

# country/countries with highest life expectancy increase
gapminder_greatest_lifeExp_increase <- gapminder_overall_lifeExp_increase %>% 
  filter(overall_increase == max(overall_increase))

gapminder_overall_lifeExp_increase
## # A tibble: 237 × 2
##    country_name       overall_increase
##    <chr>                         <dbl>
##  1 Maldives                       36.9
##  2 Bhutan                         33.2
##  3 Timor-Leste                    31.1
##  4 Tunisia                        30.9
##  5 Oman                           30.8
##  6 Nepal                          30.6
##  7 China                          29.9
##  8 Yemen, Rep.                    27.2
##  9 Saudi Arabia                   26.7
## 10 Iran, Islamic Rep.             26.6
## # … with 227 more rows

From the table above, Maldives showed the greatest increase of 36.9161463 in life expectancy since 1962.